###################################################
### Project: Issue Competition in Parliamentary ###
###          Speeches?                          ###
### Title:   Create Corpus                      ###
### Author:  Christoph Ivanusch                 ###
###################################################

# Preparation

## do not clear the global environment from inside the script
## (`rm(list = ls())` was removed: when the script is sourced it deletes the
## caller's objects, and it neither unloads packages nor resets options, so it
## does not provide a clean state. Run the script in a fresh R session instead.)

## load packages
library(dplyr)
library(tidyr)
library(stringi)
library(stringr)

## check that both input files are present before loading anything, so that a
## missing file is reported immediately and by name (paths are relative to the
## working directory)
input_files <- c(
  "Corp_Nationalrat_V2.RDS",
  "Corp_Nationalrat_20190101_20191022.RDS"
)
missing_files <- input_files[!file.exists(input_files)]
if (length(missing_files) > 0) {
  stop(
    "Missing input file(s): ",
    paste(missing_files, collapse = ", "),
    call. = FALSE
  )
}

## load Corpus by Rauh & Schwalbach
## needs to be downloaded from: "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/L4OAKN"
Corp_Nationalrat_V2 <- readRDS("Corp_Nationalrat_V2.RDS")

## load Corpus with Speeches from 2019 (self-created)
Corp_Nationalrat_2019 <- readRDS("Corp_Nationalrat_20190101_20191022.RDS")

# Filter Corpus by Rauh & Schwalbach

## keep only rows dated 2002-12-20 or later
df_20021220_20181231 <- Corp_Nationalrat_V2 %>%
  filter(date >= "2002-12-20")

## drop rows flagged as coming from the chair
df_no_chair <- df_20021220_20181231 %>%
  filter(chair == FALSE)

## remove speeches which are not given by MPs
##
## Each speaker string is classified by the office title it contains:
##   "BM" = contains "Bundesminister"
##   "BK" = contains "Bundeskanzler"
##   "SF" = contains "Schriftführer"
##   "MP" = none of the above
## The patterns are fixed substrings, so the feminine forms ("...in") are
## matched by the same test and need no branch of their own.
speakers <- df_no_chair$speaker

## Vectorised classification: conditions are evaluated in order and the first
## match wins. A missing speaker cannot be identified as an MP, so it is
## classified as NA and is dropped by the filter below.
position <- case_when(
  is.na(speakers) ~ NA_character_,
  stri_detect(speakers, fixed = "Bundesminister") ~ "BM",
  stri_detect(speakers, fixed = "Bundeskanzler") ~ "BK",
  stri_detect(speakers, fixed = "Schriftführer") ~ "SF",
  TRUE ~ "MP"
)

## attach the classification and keep only the rows classified as "MP"
df_no_chair <- cbind(df_no_chair, position)
df_mps_only <- filter(df_no_chair, position == "MP")
## reduce both data sets to the same four columns
data_2002_2018 <- df_mps_only %>%
  select(date, speaker, party, text)

data_2019 <- Corp_Nationalrat_2019 %>%
  select(date, speaker, party, text)

## stack the two data sets, 2002-2018 rows first
Corp_Nationalrat_20021220_20191022 <- rbind(data_2002_2018, data_2019)

## add info on legislative period (Gesetzgebungsperiode)
dates <- Corp_Nationalrat_20021220_20191022$date

## Vectorised lookup of the legislative period for every row. Dates that are
## missing or fall outside all listed ranges get NA here and are caught by the
## check below, so `gp` always has exactly one entry per row.
gp <- case_when(
  dates >= "2002-12-20" & dates <= "2006-10-29" ~ "22",
  dates >= "2006-10-30" & dates <= "2008-10-27" ~ "23",
  dates >= "2008-10-28" & dates <= "2013-10-28" ~ "24",
  dates >= "2013-10-29" & dates <= "2017-11-08" ~ "25",
  dates >= "2017-11-09" & dates <= "2019-10-22" ~ "26",
  TRUE ~ NA_character_
)

## fail with a clear message instead of attaching a misaligned column
unassigned <- is.na(gp)
if (any(unassigned)) {
  stop(
    sum(unassigned),
    " row(s) have a missing date or a date outside 2002-12-20 to 2019-10-22;",
    " no legislative period can be assigned.",
    call. = FALSE
  )
}


# Create and save corpus

## `gp` becomes the first column of the final corpus
Corp_Nationalrat_20021220_20191022 <- cbind(gp, Corp_Nationalrat_20021220_20191022)

saveRDS(Corp_Nationalrat_20021220_20191022, "Corp_Nationalrat_20021220_20191022.RDS")
